#include "opencl_source_map.hpp" 
namespace MNN { 
const char* deconv_2d = 
"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
"__kernel void deconv_2d(GLOBAL_SIZE_3_DIMS\n"
" #ifdef USE_BUFFER\n"
" __global FLOAT* input,\n"
" __global FLOAT* weights,\n"
" #ifdef BIAS\n"
" __global FLOAT* bias,\n"
" #endif\n"
" __global FLOAT* output,__private const int batch,\n"
" #else\n"
" __read_only image2d_t input,\n"
" __read_only image2d_t weights,\n"
" #ifdef BIAS\n"
" __read_only image2d_t bias,\n"
" #endif\n"
" __write_only image2d_t output,\n"
" #endif\n"
" __private const int2 input_shape,\n"
" __private const int2 output_shape,\n"
" __private const int2 stride_shape,\n"
" __private const int2 align_shape,\n"
" __private const int2 padding_shape,\n"
" __private const int2 kernel_shape,\n"
" __private const int kernel_size,\n"
" __private const int in_channel_blocks,__private const int out_channel_blocks) {\n"
" const int out_channel_blocks_idx=get_global_id(0);\n"
" const int out_w_idx=get_global_id(1);\n"
" const int out_batch_height_idx=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(out_channel_blocks_idx,out_w_idx,out_batch_height_idx);\n"
"#ifdef BIAS\n"
" #ifdef USE_BUFFER\n"
" FLOAT4 out0=vload4(out_channel_blocks_idx,bias);\n"
" #else\n"
" FLOAT4 out0=RI_F(bias,SAMPLER,(int2)(out_channel_blocks_idx,0));\n"
" #endif\n"
"#else\n"
" FLOAT4 out0=(FLOAT4)0;\n"
"#endif\n"
" const int out_b_idx=out_batch_height_idx/output_shape.x;\n"
" const int out_h_idx=out_batch_height_idx % output_shape.x;\n"
" \n"
" int kernel_start_x=max(0,(out_w_idx+align_shape.y)/stride_shape.y);\n"
" int kernel_start_y=max(0,(out_h_idx+align_shape.x)/stride_shape.x);\n"
" int deal_kernel_width=kernel_shape.y-mad24(kernel_start_x,stride_shape.y,padding_shape.y)+out_w_idx-1;\n"
" int deal_kernel_height=kernel_shape.x-mad24(kernel_start_y,stride_shape.x,padding_shape.x)+out_h_idx-1;\n"
" \n"
" \n"
" int kernel_x_0,kernel_x_1,kernel_x_2,kernel_x_3,kernel_y;\n"
" FLOAT4 in0;\n"
" FLOAT4 weights0,weights1,weights2,weights3;\n"
" for (int ic=0; ic<in_channel_blocks; ic++) {\n"
" kernel_x_0=ic << 2;\n"
" kernel_x_1=kernel_x_0+1;\n"
" kernel_x_2=kernel_x_0+2;\n"
" kernel_x_3=kernel_x_0+3;\n"
" for (int k_y=deal_kernel_height,idx_h=kernel_start_y; k_y >= 0; k_y -= stride_shape.x,idx_h++) {\n"
" #ifdef USE_BUFFER\n"
" int in_width0=kernel_start_x;\n"
" for (int k_x=deal_kernel_width; k_x >= 0; k_x -= stride_shape.y) {\n"
" kernel_y=mad24(k_y,kernel_shape.y,k_x);\n"
" kernel_y=mad24(out_channel_blocks_idx,kernel_size,kernel_y);\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,kernel_x_0,kernel_y,0]\n"
" weights0=vload4(kernel_x_0*(out_channel_blocks*kernel_shape.x*kernel_shape.y)+kernel_y,weights);\n"
" weights1=vload4(kernel_x_1*(out_channel_blocks*kernel_shape.x*kernel_shape.y)+kernel_y,weights);\n"
" weights2=vload4(kernel_x_2*(out_channel_blocks*kernel_shape.x*kernel_shape.y)+kernel_y,weights);\n"
" weights3=vload4(kernel_x_3*(out_channel_blocks*kernel_shape.x*kernel_shape.y)+kernel_y,weights);\n"
" bool outBoundry=(idx_h<0 || idx_h >= input_shape.x || kernel_start_x<0 || in_width0 >= input_shape.y);\n"
" int inp_offset=(((out_b_idx+ic*batch)*input_shape.x+idx_h)*input_shape.y+in_width0)*4;\n"
" in0=outBoundry ? (FLOAT4)0 : vload4(0,input+inp_offset);\n"
" out0=mad(in0.x,weights0,out0);\n"
" out0=mad(in0.y,weights1,out0);\n"
" out0=mad(in0.z,weights2,out0);\n"
" out0=mad(in0.w,weights3,out0);\n"
" in_width0++;\n"
" }\n"
" #else\n"
" int in_idy=mad24(out_b_idx,input_shape.x,idx_h);\n"
" int in_hb_value=select(in_idy,-1,idx_h<0 || idx_h >= input_shape.x);\n"
" int in_width0=kernel_start_x;\n"
" for (int k_x=deal_kernel_width; k_x >= 0; k_x -= stride_shape.y) {\n"
" kernel_y=mad24(k_y,kernel_shape.y,k_x);\n"
" kernel_y=mad24(out_channel_blocks_idx,kernel_size,kernel_y);\n"
" weights0=RI_F(weights,SAMPLER,(int2)(kernel_x_0,kernel_y));\n"
" weights1=RI_F(weights,SAMPLER,(int2)(kernel_x_1,kernel_y));\n"
" weights2=RI_F(weights,SAMPLER,(int2)(kernel_x_2,kernel_y));\n"
" weights3=RI_F(weights,SAMPLER,(int2)(kernel_x_3,kernel_y));\n"
" int in_idx=mul24(ic,input_shape.y);\n"
" int in_width_value0 = in_width0; "" in_width_value0 = "" select(in_idx + in_width_value0, -1, (in_width_value0 < 0 || in_width_value0 >= input_shape.y)); "" in0=RI_F(input,SAMPLER,(int2)(in_width_value0,in_hb_value));\n"
" out0=mad(in0.x,weights0,out0);\n"
" out0=mad(in0.y,weights1,out0);\n"
" out0=mad(in0.z,weights2,out0);\n"
" out0=mad(in0.w,weights3,out0);\n"
" in_width0++;\n"
" }\n"
" #endif\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(FLOAT4)0,(FLOAT4)6);\n"
"#endif\n"
"#ifdef USE_BUFFER\n"
" const int out_offset=(((out_b_idx+out_channel_blocks_idx*batch)*output_shape.x+out_h_idx)*output_shape.y+out_w_idx)*4;\n"
" vstore4(out0,0,output+out_offset);\n"
"#else\n"
" int out_image_width_idx=mad24(out_channel_blocks_idx,output_shape.y,out_w_idx);\n"
" WI_F(output,(int2)(out_image_width_idx,out_batch_height_idx),out0);\n"
"#endif\n"
"}\n"
"__kernel void iohw2oihw(__global const float* input_ptr,__global float* output_ptr,int plane_number,int input_channel,int output_channel) {\n"
" const int ic_index=get_global_id(0),oc_index=get_global_id(1);\n"
" if (ic_index >= input_channel || oc_index >= output_channel) {\n"
" return;\n"
" }\n"
" const int input_offset=(ic_index*output_channel+oc_index)*plane_number;\n"
" const int output_offset=(oc_index*input_channel+ic_index)*plane_number;\n"
" for (int i=0; i<plane_number; ++i) {\n"
" output_ptr[output_offset+i]=input_ptr[input_offset+i];\n"
" }\n"
"}\n"
;
}
